# Set the dataset path and the working directory here.
export dataset_dir=/gemini/platform/public/datasets/wudao_test
export WORK_DIR=/gemini/code/

# Generate mmap-format pretraining datasets for the training and validation
# sets.
#
# The preprocessing script is launched from its own directory because the
# first argument (../../) is a relative path. The cd and the script are chained
# with && so the script is never started from the wrong directory when the cd
# fails; this form is also safe if this file is sourced rather than executed.
# ${WORK_DIR%/} drops a trailing slash so the joined path has no "//".
#
# Positional arguments, in order -- presumably: patch repository root, input
# data directory, tokenizer name, output directory, and the model/tokenizer
# directory (TODO confirm against run_make_pretraining_dataset.sh).
cd "${WORK_DIR%/}/Pai-Megatron-Patch/toolkits/pretrain_data_preprocessing" &&
  bash run_make_pretraining_dataset.sh \
    ../../ \
    "${dataset_dir}/cleaned_zst/" \
    qwenbpe \
    "${dataset_dir}/pl_data/" \
    /gemini/platform/public/llm/huggingface/Qwen/Qwen1.5-7B-Chat